library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(ggrepel)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load the catalogue; blank cells and literal "NA" strings become missing values.
netflix <- read.csv(
  file = "netflix.csv",
  na.strings = c("", "NA"),
  stringsAsFactors = FALSE
)

# Missing-value audit: grand total first, then the per-column breakdown.
sum(is.na(netflix))
## [1] 3631
colSums(is.na(netflix))
##      show_id         type        title     director         cast      country 
##            0            0            0         2389          718          507 
##   date_added release_year       rating     duration    listed_in  description 
##           10            0            7            0            0            0

# Recode the categorical columns as factors and derive calendar fields from
# the textual `date_added` column:
#   * `type` gets an explicit level order (TV Show, then Movie);
#   * `date_added2` is the parsed date;
#   * `month` / `day` are the month and weekday names of `date_added2`,
#     with levels running January..December and Monday..Sunday.
netflix <- netflix %>%
  mutate(
    rating = as.factor(rating),
    listed_in = as.factor(listed_in),
    type = factor(type, levels = c("TV Show", "Movie")),
    date_added2 = mdy(date_added),
    month = factor(
      month(date_added2, label = TRUE, abbr = FALSE),
      levels = month.name
    ),
    day = factor(
      wday(date_added2, label = TRUE, abbr = FALSE),
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  )

# Titles added to Netflix per calendar month, bars sorted by volume.
# `month` is derived from the date a title was added and the data is not
# filtered by type, so the labels talk about "titles added" rather than
# "movies released" (the original title also said "by year").
netflix %>%
  filter(!is.na(month)) %>%
  count(month, name = "count") %>%
  ggplot(aes(x = reorder(month, count), y = count, fill = month)) +
  geom_col() +
  labs(
    x = "months of the year",
    y = "Number of titles",
    title = "Number of titles added by month"
  ) +
  coord_flip()

# Same breakdown by day of the week on which the title was added.
netflix %>%
  filter(!is.na(day)) %>%
  count(day, name = "count") %>%
  ggplot(aes(x = reorder(day, count), y = count, fill = day)) +
  geom_col() +
  labs(
    x = "days of the week",
    y = "Number of titles",
    title = "Number of titles added by day of the week"
  ) +
  coord_flip()

## Replace the textual `date_added` column with parsed dates,
## previewing the column before and after the conversion.
head(netflix[["date_added"]])
## [1] "August 14, 2020"   "December 23, 2016" "December 20, 2018"
## [4] "November 16, 2017" "January 1, 2020"   "July 1, 2017"
netflix <- netflix %>%
  mutate(date_added = mdy(date_added))
head(netflix[["date_added"]])
## [1] "2020-08-14" "2016-12-23" "2018-12-20" "2017-11-16" "2020-01-01"
## [6] "2017-07-01"
head(netflix)
##   show_id    type title          director
## 1      s1 TV Show    3%              <NA>
## 2      s2   Movie  7:19 Jorge Michel Grau
## 3      s3   Movie 23:59      Gilbert Chan
## 4      s4   Movie     9       Shane Acker
## 5      s5   Movie    21    Robert Luketic
## 6      s6 TV Show    46       Serdar Akar
##                                                                                                                                                                         cast
## 1 João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Valente, Vaneza Oliveira, Rafael Lozano, Viviane Porto, Mel Fronckowiak, Sergio Mamberti, Zezé Motta, Celso Frateschi
## 2                                                                                   Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato
## 3                                                               Tedd Chan, Stella Chung, Henley Hii, Lawrence Koh, Tommy Kuan, Josh Lai, Mark Lee, Susan Leong, Benjamin Lim
## 4                            Elijah Wood, John C. Reilly, Jennifer Connelly, Christopher Plummer, Crispin Glover, Martin Landau, Fred Tatasciore, Alan Oppenheimer, Tom Kane
## 5            Jim Sturgess, Kevin Spacey, Kate Bosworth, Aaron Yoo, Liza Lapira, Jacob Pitts, Laurence Fishburne, Jack McGee, Josh Gad, Sam Golzari, Helen Carey, Jack Gilpin
## 6                            Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan, Saygın Soysal, Berkan Şal, Metin Belgin, Ayça Eren, Selin Uludoğan, Özay Fecht, Suna Yıldızoğlu
##         country date_added release_year rating  duration
## 1        Brazil 2020-08-14         2020  TV-MA 4 Seasons
## 2        Mexico 2016-12-23         2016  TV-MA    93 min
## 3     Singapore 2018-12-20         2011      R    78 min
## 4 United States 2017-11-16         2009  PG-13    80 min
## 5 United States 2020-01-01         2008  PG-13   123 min
## 6        Turkey 2017-07-01         2016  TV-MA  1 Season
##                                                  listed_in
## 1   International TV Shows, TV Dramas, TV Sci-Fi & Fantasy
## 2                             Dramas, International Movies
## 3                      Horror Movies, International Movies
## 4 Action & Adventure, Independent Movies, Sci-Fi & Fantasy
## 5                                                   Dramas
## 6          International TV Shows, TV Dramas, TV Mysteries
##                                                                                                                                             description
## 1              In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor.
## 2  After a devastating earthquake hits Mexico City, trapped survivors from all walks of life wait to be rescued while trying desperately to stay alive.
## 3 When an army recruit is found dead, his fellow soldiers are forced to confront a terrifying secret that's haunting their jungle island training camp.
## 4     In a postapocalyptic world, rag-doll robots hide in fear from dangerous machines out to exterminate them, until a brave newcomer joins the group.
## 5       A brilliant group of students become card-counting experts with the intent of swindling millions out of Las Vegas casinos by playing blackjack.
## 6 A genetics professor experiments with a treatment for his comatose sister that blends medical and shamanic cures, but unlocks a shocking side effect.
##   date_added2    month       day
## 1  2020-08-14   August    Friday
## 2  2016-12-23 December    Friday
## 3  2018-12-20 December  Thursday
## 4  2017-11-16 November  Thursday
## 5  2020-01-01  January Wednesday
## 6  2017-07-01     July  Saturday
# Per-column summary of the prepared data frame (printed below).
summary(netflix)
##    show_id               type         title             director        
##  Length:7787        TV Show:2410   Length:7787        Length:7787       
##  Class :character   Movie  :5377   Class :character   Class :character  
##  Mode  :character                  Mode  :character   Mode  :character  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##      cast             country            date_added          release_year 
##  Length:7787        Length:7787        Min.   :2008-01-01   Min.   :1925  
##  Class :character   Class :character   1st Qu.:2018-02-01   1st Qu.:2013  
##  Mode  :character   Mode  :character   Median :2019-03-08   Median :2017  
##                                        Mean   :2019-01-02   Mean   :2014  
##                                        3rd Qu.:2020-01-20   3rd Qu.:2018  
##                                        Max.   :2021-01-16   Max.   :2021  
##                                        NA's   :10                         
##      rating       duration        
##  TV-MA  :2863   Length:7787       
##  TV-14  :1931   Class :character  
##  TV-PG  : 806   Mode  :character  
##  R      : 665                     
##  PG-13  : 386                     
##  (Other):1129                     
##  NA's   :   7                     
##                                             listed_in    description       
##  Documentaries                                   : 334   Length:7787       
##  Stand-Up Comedy                                 : 321   Class :character  
##  Dramas, International Movies                    : 320   Mode  :character  
##  Comedies, Dramas, International Movies          : 243                     
##  Dramas, Independent Movies, International Movies: 215                     
##  Kids' TV                                        : 205                     
##  (Other)                                         :6149                     
##   date_added2              month             day      
##  Min.   :2008-01-01   December: 833   Friday   :2287  
##  1st Qu.:2018-02-01   October : 785   Thursday :1147  
##  Median :2019-03-08   January : 757   Tuesday  :1070  
##  Mean   :2019-01-02   November: 738   Wednesday:1020  
##  3rd Qu.:2020-01-20   March   : 669   Monday   : 814  
##  Max.   :2021-01-16   (Other) :3995   (Other)  :1439  
##  NA's   :10           NA's    :  10   NA's     :  10
# Compact column-by-column overview: one line per column with its type and first values.
glimpse(netflix)
## Rows: 7,787
## Columns: 15
## $ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
## $ type         <fct> TV Show, Movie, Movie, Movie, Movie, TV Show, Movie, Movi…
## $ title        <chr> "3%", "7:19", "23:59", "9", "21", "46", "122", "187", "70…
## $ director     <chr> NA, "Jorge Michel Grau", "Gilbert Chan", "Shane Acker", "…
## $ cast         <chr> "João Miguel, Bianca Comparato, Michel Gomes, Rodolfo Val…
## $ country      <chr> "Brazil", "Mexico", "Singapore", "United States", "United…
## $ date_added   <date> 2020-08-14, 2016-12-23, 2018-12-20, 2017-11-16, 2020-01-…
## $ release_year <int> 2020, 2016, 2011, 2009, 2008, 2016, 2019, 1997, 2019, 200…
## $ rating       <fct> TV-MA, TV-MA, R, PG-13, PG-13, TV-MA, TV-MA, R, TV-14, TV…
## $ duration     <chr> "4 Seasons", "93 min", "78 min", "80 min", "123 min", "1 …
## $ listed_in    <fct> "International TV Shows, TV Dramas, TV Sci-Fi & Fantasy",…
## $ description  <chr> "In a future where the elite inhabit an island paradise f…
## $ date_added2  <date> 2020-08-14, 2016-12-23, 2018-12-20, 2017-11-16, 2020-01-…
## $ month        <ord> August, December, December, November, January, July, June…
## $ day          <ord> Friday, Friday, Thursday, Thursday, Wednesday, Saturday, …
# How many catalogue entries are TV shows vs. movies: as a table, then as bars.
table(netflix$type)
## 
## TV Show   Movie 
##    2410    5377
netflix %>%
  count(type) %>%
  ggplot(aes(x = type, y = n, fill = type)) +
  geom_col() +
  labs(title = "Show Types") +
  theme_minimal()

### Share of the catalogue by show type, drawn as a pie chart.
# The slice size must be numeric: mapping the formatted "69%" string to `y`
# puts the bars on a discrete scale, so slice sizes no longer reflect the
# actual shares. Keep the numeric share in `prop` and use the formatted text
# only as the label.
netflix %>%
  count(type, sort = TRUE) %>%
  mutate(
    prop = n / sum(n),
    prop_label = paste0(round(prop * 100, 0), "%")
  ) %>%
  ggplot(aes(x = "", y = prop, fill = type)) +
  geom_col(
    width = 1,
    color = "steelblue",
    size = 1
  ) +
  coord_polar("y", start = 0) +
  geom_text(
    aes(label = prop_label),
    # centre each label inside its slice
    position = position_stack(vjust = 0.5),
    size = 6,
    col = "white",
    fontface = "bold"
  ) +
  scale_fill_manual(values = c("#e41a1c", "#377eb8")) +
  theme_void() +
  labs(
    title = "Proportion of Movies to TV shows",
    fill = ""
  )

# netflix %>% filter(title=="Black Mirror: Bandersnatch")
library(broom)

# Movies only, restricted to rows where country, type, duration, rating and
# title are all present; `duration_min` is the number parsed out of `duration`.
movies <- netflix %>%
  filter(type == "Movie") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_min = parse_number(duration))

# Movies running longer than 200 minutes, longest first.
movies %>%
  filter(duration_min > 200) %>%
  arrange(desc(duration_min)) %>%
  select(title, duration_min)
##                                            title duration_min
## 1                     Black Mirror: Bandersnatch          312
## 2                         The School of Mischief          253
## 3                                 No Longer kids          237
## 4                                         Sangam          228
## 5                                         Lagaan          224
## 6                                   Jodhaa Akbar          214
## 7                       Kabhi Khushi Kabhie Gham          209
## 8                                   The Irishman          209
## 9                   No Direction Home: Bob Dylan          208
## 10                            The Gospel of Luke          205
## 11                          What's Your Raashee?          203
## 12 The Lord of the Rings: The Return of the King          201

# Histogram of movie running times (row order does not affect the bins).
movies %>%
  ggplot(aes(x = duration_min)) +
  geom_histogram(fill = "dark red") +
  labs(title = "Distribution of Movie Duration")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### TV shows only, complete rows; `duration_season` is the number parsed
### out of `duration` (e.g. "4 Seasons" -> 4).
tv_show <- netflix %>%
  filter(type == "TV Show") %>%
  select(country, type, duration, rating, title) %>%
  drop_na() %>%
  mutate(duration_season = parse_number(duration))

# Shows with more than 10 seasons, longest-running first.
tv_show %>%
  filter(duration_season > 10) %>%
  arrange(desc(duration_season)) %>%
  select(title, duration_season)
##                     title duration_season
## 1          Grey's Anatomy              16
## 2                    NCIS              15
## 3            Supernatural              15
## 4  COMEDIANS of the world              13
## 5            Red vs. Blue              13
## 6          Criminal Minds              12
## 7       Trailer Park Boys              12
## 8                  Cheers              11
## 9                 Frasier              11
## 10              Heartland              11

# Histogram of season counts.
tv_show %>%
  ggplot(aes(x = duration_season)) +
  geom_histogram(fill = "dark blue", size = 3) +
  labs(title = "Distribution of TV Shows Duration")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# One bar per season count, annotated with the number of shows.
tv_show %>%
  count(duration_season, sort = TRUE) %>%
  ggplot(aes(x = as.factor(duration_season), y = n)) +
  geom_col(aes(fill = duration_season)) +
  geom_text(aes(label = n), vjust = -0.5, size = 3, col = "darkblue") +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = "Season duration",
    y = "Count",
    title = "Distribution of TV Shows Duration",
    fill = ""
  )

#### Movie running time over all movies: mean, then mean and median,
#### then the full distribution.
netflix %>%
  filter(type == "Movie") %>%
  summarise(mean_duration = mean(parse_number(duration)))
##   mean_duration
## 1      99.30798
netflix %>%
  filter(type == "Movie") %>%
  summarise(
    mean_duration = mean(parse_number(duration)),
    median_duration = median(parse_number(duration))
  )
##   mean_duration median_duration
## 1      99.30798              98
netflix %>%
  filter(type == "Movie") %>%
  mutate(duration = parse_number(duration)) %>%
  ggplot(aes(x = duration)) +
  geom_histogram(fill = "dark blue") +
  labs(title = "Distribution of Movie Duration")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Top 20 countries among titles whose `country` field names a single country
# (entries containing a comma, and missing countries, are filtered out).
netflix %>%
  filter(!str_detect(country, ",")) %>%
  count(country, sort = TRUE) %>%
  head(20) %>%
  ggplot(aes(y = reorder(country, n), x = n)) +
  geom_col(aes(fill = reorder(country, n)), width = 0.4) +
  geom_label(aes(label = n)) +
  labs(title = "Number of Shows of each Country")

#netflix$date_added <- as.Date(netflix$date_added, format = "%B %d, %Y")

# Number of shows available in Netflix as a function of time.
# For each show type: titles added per date (`addedToday`) and their running
# total (`Total_Number_of_Shows`). `label` holds the type name on the last
# date of each series only, so it can be used as a direct line label.
# Rows without a `date_added` cannot be placed on the time axis and are
# dropped up front rather than surfacing as a "removed rows" warning later.
df_by_date <- netflix %>%
  filter(!is.na(date_added)) %>%
  count(date_added, type, name = "addedToday") %>%
  group_by(type) %>%
  mutate(
    Total_Number_of_Shows = cumsum(addedToday),
    label = if_else(
      date_added == max(date_added, na.rm = TRUE),
      as.character(type),
      NA_character_
    )
  )

df_by_date %>%
  ggplot(aes(x = date_added, y = Total_Number_of_Shows, color = type)) +
  geom_line() +
  geom_text_repel(aes(label = label), size = 8, na.rm = TRUE, nudge_y = 100) +
  scale_x_date(date_breaks = "2 years", date_labels = "%Y") +
  labs(
    title = "Number of Shows in Netflix by Time",
    x = "years", y = "number of shows"
  ) +
  # A complete theme resets every earlier theme() tweak, so it has to come
  # first; the legend is redundant because the lines are labelled directly.
  theme_minimal() +
  theme(legend.position = "none")

## Which countries are producing the most shows?

# Titles per `country` value and show type.
# Missing countries are excluded with an explicit is.na() test; the previous
# `country != 'NA'` comparison only worked because comparisons against NA
# yield NA and filter() drops those rows.
netflix %>%
  filter(!is.na(country)) %>%
  group_by(type) %>%
  mutate(country = fct_infreq(country)) %>%
  count(country)
## # A tibble: 773 x 3
## # Groups:   type [2]
##    type    country            n
##    <fct>   <fct>          <int>
##  1 TV Show United States    705
##  2 TV Show United Kingdom   204
##  3 TV Show Japan            157
##  4 TV Show South Korea      147
##  5 TV Show India             71
##  6 TV Show Taiwan            68
##  7 TV Show Canada            59
##  8 TV Show Australia         46
##  9 TV Show France            46
## 10 TV Show Spain             45
## # … with 763 more rows

# Bar chart of the first ten countries on each facet's axis.
# geom_bar() counts rows itself; geom_histogram(stat = 'count') did the same
# but warned about the ignored binning parameters.
# NOTE(review): fct_infreq() runs inside group_by(type); confirm that the
# resulting level order gives the intended per-type ranking in both facets.
netflix %>%
  filter(!is.na(country)) %>%
  group_by(type) %>%
  mutate(country = fct_infreq(country)) %>%
  ggplot(aes(x = country, fill = type)) +
  geom_bar() +
  facet_wrap(~type, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  coord_cartesian(xlim = c(1, 10)) +
  scale_x_discrete(
    labels = function(x) str_wrap(x, 20),
    breaks = function(x) x[1:10]
  )

##From the above, we can see that: After United States, India is the largest source of Movies listed on Netflix, and UK is the third. While many of the Indian movies ended up on Netflix, it looks like there are not as many Indian TV shows. A large number of listings also seem to have missing country information, which could possibly be inferred from the cast etc.
# One row per (show, category): split the comma-separated `listed_in` field
# and strip the whitespace left around each piece.
show_categories <- netflix %>%
  select(show_id, type, listed_in) %>%
  separate_rows(listed_in, sep = ",") %>%
  transmute(show_id, type, Show_Category = trimws(listed_in))
head(show_categories)
## # A tibble: 6 x 3
##   show_id type    Show_Category         
##   <chr>   <fct>   <chr>                 
## 1 s1      TV Show International TV Shows
## 2 s1      TV Show TV Dramas             
## 3 s1      TV Show TV Sci-Fi & Fantasy   
## 4 s2      Movie   Dramas                
## 5 s2      Movie   International Movies  
## 6 s3      Movie   Horror Movies
# Titles per category within each show type; categories are ordered by their
# overall frequency before counting.
show_categories %>%
  mutate(across(Show_Category, fct_infreq)) %>%
  group_by(type) %>%
  count(Show_Category)
## # A tibble: 42 x 3
## # Groups:   type [2]
##    type    Show_Category              n
##    <fct>   <fct>                  <int>
##  1 TV Show International TV Shows  1199
##  2 TV Show TV Dramas                704
##  3 TV Show TV Comedies              525
##  4 TV Show Crime TV Shows           427
##  5 TV Show Kids' TV                 414
##  6 TV Show Docuseries               353
##  7 TV Show Romantic TV Shows        333
##  8 TV Show British TV Shows         232
##  9 TV Show Reality TV               222
## 10 TV Show Korean TV Shows          150
## # … with 32 more rows

# Same counts as bars, one facet per show type, showing the first 20
# positions of each facet's axis.
show_categories %>%
  mutate(across(Show_Category, fct_infreq)) %>%
  ggplot(aes(x = Show_Category, fill = type)) +
  geom_bar() +
  scale_x_discrete() +
  facet_wrap(~type, scales = "free_x") +
  coord_cartesian(xlim = c(1, 20)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

##############
# Category co-occurrence: for every pair of categories within a show type,
# count how many shows are listed under both, then draw the Movie pairs as a
# heat map.

# Distinct (type, category) combinations; summarise() with no arguments keeps
# one row per group.
df_unique_categories <- show_categories %>% 
  group_by(type,Show_Category) %>%  
  summarise()
## `summarise()` has grouped output by 'type'. You can override using the `.groups` argument.
# Every ordered pair (Category1, Category2) of Movie categories.
df_category_correlations_movies <- 
  data.frame(expand_grid(type = 'Movie', 
                         Category1 = subset(df_unique_categories, type == 'Movie')$Show_Category,
                         Category2 = subset(df_unique_categories, type == 'Movie')$Show_Category))
                                  
# Every ordered pair of TV Show categories.
df_category_correlations_TV <- 
  data.frame(expand_grid(type = 'TV Show',
                         Category1 = subset(df_unique_categories, type == 'TV Show')$Show_Category,
                         Category2 = subset(df_unique_categories, type == 'TV Show')$Show_Category))
                                 
df_category_correlations <- rbind(df_category_correlations_movies,df_category_correlations_TV)
# For each pair (one row, passed to the function as `x`), the number of
# show_ids that appear under both Category1 and Category2 for that type.
df_category_correlations$matched_count <- 
  apply(df_category_correlations, MARGIN = 1,FUN = function(x) 
  {
    length(intersect(subset(show_categories,type == x['type'] & Show_Category == x['Category1'])$show_id,
                     subset(show_categories, type == x['type'] & Show_Category == x['Category2'])$show_id))
    })

# Keep each unordered pair once (Category1 sorts before Category2, which also
# drops a category paired with itself) and only pairs that actually co-occur.
df_category_correlations <- 
  subset(df_category_correlations, (as.character(Category1) < as.character(Category2)) & (matched_count > 0))

# Set the repr.plot.* size options to 14 x 10
options(repr.plot.width=14, repr.plot.height=10)

# Heat map of co-occurrence counts for Movie categories only.
ggplot(subset(df_category_correlations, type == 'Movie'), 
       aes(x = Category1, y = Category2, fill = matched_count)) + 
  geom_tile() + facet_wrap( ~type, scales = 'free') +   
  theme(axis.text.x = element_text(angle = 90, hjust = 1))  + 
  scale_fill_distiller(palette = "Spectral") + 
  theme(legend.text = element_text(size = 14), legend.title = element_text(size = 16))

# Number of titles per individual genre.
# `listed_in` is a comma-separated list ("Dramas, International Movies"), so
# splitting on "," alone leaves a leading space on every genre after the
# first and " Dramas" is then counted separately from "Dramas". Trim each
# piece before counting so every genre gets a single row.
genre_counts <- netflix %>%
  select(listed_in) %>%
  separate_rows(listed_in, sep = ",") %>%
  mutate(listed_in = str_trim(listed_in)) %>%
  count(listed_in, sort = TRUE)

genre_counts %>%
  head(30)
## NOTE: the listing below was captured before the whitespace fix above, which
## is why " Dramas" and "Dramas" show up as separate rows; re-knit to refresh.
## # A tibble: 30 x 2
## # Groups:   listed_in [30]
##    listed_in                    n
##    <chr>                    <int>
##  1 " International Movies"   2323
##  2 "Dramas"                  1384
##  3 "Comedies"                1074
##  4 "Documentaries"            751
##  5 " Dramas"                  722
##  6 "Action & Adventure"       721
##  7 "International TV Shows"   690
##  8 " Independent Movies"      653
##  9 " TV Dramas"               642
## 10 " Romantic Movies"         528
## # … with 20 more rows

# The 30 most common genres as horizontal bars labelled with their counts.
genre_counts %>%
  head(30) %>%
  ggplot(aes(y = reorder(listed_in, n), x = n)) +
  geom_col(aes(fill = n)) +
  geom_label(aes(label = n), size = 2) +
  labs(
    title = "Category of Movie and TV Shows",
    x = "Count",
    y = "Genre"
  ) +
  theme_minimal()

# One row per (show, role, person): stack the `cast` and `director` columns,
# split their comma-separated names and trim each name.
# pivot_longer() replaces the superseded gather(); missing cells are dropped
# while reshaping and empty names are dropped after trimming.
show_people <- netflix %>%
  select(show_id, cast, director) %>%
  pivot_longer(
    cols = c(cast, director),
    names_to = "role",
    values_to = "person",
    values_drop_na = TRUE
  ) %>%
  separate_rows(person, sep = ",") %>%
  mutate(person = trimws(person)) %>%
  filter(person != "")
head(show_people)
## # A tibble: 6 x 3
##   show_id role  person          
##   <chr>   <chr> <chr>           
## 1 s1      cast  João Miguel     
## 2 s1      cast  Bianca Comparato
## 3 s1      cast  Michel Gomes    
## 4 s1      cast  Rodolfo Valente 
## 5 s1      cast  Vaneza Oliveira 
## 6 s1      cast  Rafael Lozano

# Number of titles per person and role, most frequent first. The result stays
# grouped by `person`, as before; `.groups` only silences the message.
people_freq <- show_people %>%
  group_by(person, role) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  arrange(desc(count))

# Ten most frequent people per role (ties at the cut-off are kept), one facet
# per role.
people_freq %>%
  group_by(role) %>%
  slice_max(order_by = count, n = 10) %>%
  ungroup() %>%
  ggplot(aes(x = fct_reorder(person, count, .desc = TRUE), y = count, fill = role)) +
  geom_col() +
  scale_x_discrete() +
  facet_wrap(~role, scales = "free_x") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  ) +
  labs(x = "Name of the actor / director")

# Number of catalogue titles per release year; the bar outline colour encodes
# the same count.
netflix %>%
  count(release_year, sort = TRUE) %>%
  ggplot(aes(x = release_year, y = n, color = n)) +
  geom_col()

#Where do the majority of movies available on Netflix come from?

sum(is.na(netflix$country))
## [1] 507
# Split the comma-separated `country` field into one row per (title, country)
# and count titles per country and show type.
#   count : titles for this country AND this type
#   n     : titles for this country across both types
# Previously only the country-wide total `n` was kept and repeated on both the
# Movie and the TV Show row; stacking those rows doubled every bar and made
# the fill split meaningless.
df_country <- netflix %>%
  filter(!is.na(country)) %>%
  mutate(country = strsplit(as.character(country), ",")) %>%
  unnest(country) %>%
  mutate(country = trimws(country)) %>%
  filter(country != "") %>%
  count(country, type, name = "count") %>%
  group_by(country) %>%
  mutate(n = sum(count)) %>%
  ungroup() %>%
  select(country, n, type, count)

# Countries with the largest overall catalogue. 18 matches the number of
# countries the original 35-row cut contained ("Groups: country [18]").
n_top_countries <- 18
top_countries <- df_country %>%
  distinct(country, n) %>%
  slice_max(order_by = n, n = n_top_countries, with_ties = FALSE) %>%
  pull(country)

# The name is historical: this holds every type row of the top countries.
df_country_top5 <- df_country %>%
  filter(country %in% top_countries) %>%
  arrange(desc(n), country, type)
df_country_top5
## NOTE: the listing below was captured before the per-type `count` column was
## added; re-knit to refresh it.
## # A tibble: 35 x 3
## # Groups:   country [18]
##    country            n type   
##    <chr>          <int> <fct>  
##  1 United States   3297 Movie  
##  2 United States   3297 TV Show
##  3 India            990 Movie  
##  4 India            990 TV Show
##  5 United Kingdom   723 Movie  
##  6 United Kingdom   723 TV Show
##  7 Canada           412 Movie  
##  8 Canada           412 TV Show
##  9 France           349 Movie  
## 10 France           349 TV Show
## # … with 25 more rows

# Horizontal stacked bars: bar length is the country's total, split by type.
# Countries are ordered by their overall total `n`.
ggplot(df_country_top5, aes(x = count, y = reorder(country, n), fill = type)) +
  geom_col() +
  theme_classic() +
  labs(
    title = "Content available per country",
    x = "Amount of content",
    y = NULL
  )

##What are the most frequent words used in movie titles?

library(tokenizers)
library(wordcloud)
## Loading required package: RColorBrewer
# 
# tot_title <- paste(netflix[,3],collapse=" ")
# tot_title_words <-  tokenize_words(tot_title)
# words.freq<-table(unlist(tot_title_words))
# 
# result <- cbind.data.frame(words = names(words.freq),amount = as.integer(words.freq)) ## You might consider using cbind.data.frame instead of cbind
# 
# result_dec <- result[order(-result$amount),]
# 
# result_dec_filter <- result_dec %>%
#   filter(nchar( as.character(words)) > 3)
# 
# wordcloud(words = result_dec_filter$word, freq = result_dec_filter$amount, min.freq = 1,  max.words=150, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Set2"))
library(tokenizers)
library(wordcloud)
library(tidytext)
# Word cloud of movie descriptions: tokenise each description into words,
# drop stop words, then plot every word that occurs at least 55 times.
desc_words_m <- netflix %>%
  filter(type == "Movie") %>%
  select(type, show_id, description) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words)
## Joining, by = "word"
count_word <- count(desc_words_m, word, sort = TRUE)
with(
  count_word,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 55,
    max.words = nrow(count_word),
    random.order = FALSE,
    rot.per = 0.1,
    colors = brewer.pal(8, "Dark2")
  )
)

#Life, Women, Love, Friends, Family, Home, world
# Word cloud of TV show descriptions: same steps as for movies, with the
# minimum frequency lowered to 30. Note that `count_word` is overwritten.
desc_words_tv <- netflix %>%
  filter(type == "TV Show") %>%
  select(type, show_id, description) %>%
  unnest_tokens(word, description) %>%
  anti_join(stop_words)
## Joining, by = "word"
count_word <- count(desc_words_tv, word, sort = TRUE)

with(
  count_word,
  wordcloud(
    words = word,
    freq = n,
    min.freq = 30,
    max.words = nrow(count_word),
    random.order = FALSE,
    rot.per = 0.1,
    colors = brewer.pal(8, "Dark2")
  )
)

#World, Life, Love, Lives, Friends, Family, School are the most frequent words
# titles=str_flatten(netflix[,3],collapse = '')
# titles_split=tokenize_words(titles)
# word_freq=table(unlist(titles_split))
# 
# result=cbind.data.frame(words=names(word_freq),
#                         word_count=as.integer(word_freq))
# result_top=result%>%arrange(desc(word_count))%>%
#            filter(nchar(as.character(words))>3)
# 
# cloud=wordcloud(words = result_top$words,freq = result_top$word_count,min.freq = 1,
#               max.words = 200,random.order = F,
#               rot.per =0.35, colors=brewer.pal(8,"Dark2"))
##Rating by Type

# Titles per rating and show type. Ratings outside the five most common are
# lumped into "Other"; rows without a rating are excluded.
r <- netflix %>%
  filter(!is.na(rating)) %>%
  transmute(rating = fct_lump(rating, 5), type) %>%
  group_by(rating, type) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
## `summarise()` has grouped output by 'rating'. You can override using the `.groups` argument.
filter(r, type == "Movie")
## # A tibble: 6 x 3
## # Groups:   rating [6]
##   rating type  Count
##   <fct>  <fct> <int>
## 1 TV-MA  Movie  1845
## 2 TV-14  Movie  1272
## 3 Other  Movie   701
## 4 R      Movie   663
## 5 TV-PG  Movie   505
## 6 PG-13  Movie   386
filter(r, type == "TV Show")
## # A tibble: 5 x 3
## # Groups:   rating [5]
##   rating type    Count
##   <fct>  <fct>   <int>
## 1 TV-MA  TV Show  1018
## 2 TV-14  TV Show   659
## 3 Other  TV Show   428
## 4 TV-PG  TV Show   301
## 5 R      TV Show     2

# Stacked bars: one bar per show type, filled by rating.
ggplot(r, aes(x = type, y = Count, fill = rating)) +
  geom_col()

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive bar chart of titles per rating and show type.
# Show type is on the x axis and the count on the y axis, so the axis titles
# are assigned accordingly (they were swapped before). The earlier
# `categoryorder` setting was attached to the numeric y axis, where it has no
# effect, and has been dropped.
netflix %>%
  select(rating, type) %>%
  filter(!is.na(rating)) %>%
  mutate(rating = fct_lump(rating, 5)) %>%
  group_by(rating, type) %>%
  # "drop_last" is what summarise() did by default; stating it silences the message
  summarise(Count = n(), .groups = "drop_last") %>%
  arrange(Count) %>%
  plot_ly(
    x = ~type,
    y = ~Count,
    type = "bar",
    color = ~rating,
    text = ~Count,
    textposition = "outside",
    textfont = list(color = "#000000", size = 12)
  ) %>%
  layout(
    title = "Rating by Type",
    xaxis = list(title = "Type"),
    yaxis = list(title = "Count"),
    legend = list(title = list(text = "<b> Rating </b>"))
  )
# Movies vs. TV shows for the ten countries with the most content overall.

# One entry per (title, country): split the comma-separated country field.
# Splitting on a comma plus any following whitespace also copes with entries
# that lack the space after the comma.
k <- strsplit(netflix$country, split = ",\\s*")

netds_countries <- data.frame(
  type = rep(netflix$type, lengths(k)),
  country = trimws(unlist(k)),
  stringsAsFactors = FALSE
)

# Titles per country and type; rows without a country are dropped.
amount_by_country <- na.omit(netds_countries) %>%
  filter(country != "") %>%
  group_by(country, type) %>%
  summarise(count = n(), .groups = "drop_last")

# Wide layout with one column per type. Columns are picked by name rather than
# by position, and a country that has no titles of one type gets 0 rather than
# NA. The top ten are chosen by total content; the earlier bare top_n(10)
# silently ranked by the last column (TV shows) only.
u <- amount_by_country %>%
  ungroup() %>%
  pivot_wider(names_from = type, values_from = count, values_fill = 0L) %>%
  transmute(
    country,
    Number_of_Movies = Movie,
    Number_of_TV_Shows = `TV Show`
  ) %>%
  arrange(desc(Number_of_Movies + Number_of_TV_Shows)) %>%
  head(10) %>%
  as.data.frame()

library(ggplot2)
figure000 <- ggplot(u, aes(Number_of_Movies, Number_of_TV_Shows, colour = country)) +
  geom_point(size = 5) +
  xlab("Number of Movies") + ylab("Number of TV Shows") +
  ggtitle("Amount of Netflix Content By Top 10 Country")
figure000